# Load datasets for the attack-detection analysis.
# NOTE(review): the original repeated this whole block twice and used
# rm(list = ls()), which is discouraged in scripts — start a fresh R session
# instead of wiping the workspace programmatically.
train1 <- read.csv("train_dataset01.csv")
train2 <- read.csv("train_dataset02.csv")
test <- read.csv("test_dataset.csv")
I tried printing out the summaries, but they are too long to be immediately useful. Let’s try plotting the variables separately for times with cyber attacks and times without.
library(tidyverse)
-- Attaching packages --------------------------------------- tidyverse 1.2.1 --
v ggplot2 3.1.0     v purrr   0.2.5
v tibble  1.4.2     v dplyr   0.7.7
v tidyr   0.8.2     v stringr 1.3.1
v readr   1.2.1     v forcats 0.3.0
-- Conflicts ------------------------------------------ tidyverse_conflicts() --
x dplyr::filter() masks stats::filter()
x dplyr::lag()    masks stats::lag()
library(ggplot2)
# Histogram of every numeric feature (one facet per feature) for a subset of
# observations. The original duplicated this pipeline verbatim for the attack
# and no-attack subsets; a helper keeps the two plots consistent.
# gather() is kept because this session runs tidyr 0.8.x (pre-pivot_longer).
plot_numeric_hists <- function(df, bin_width = 25) {
  df %>%
    keep(is.numeric) %>%
    gather() %>%
    ggplot(aes(value)) +
    facet_wrap(~ key, scales = "free") +
    geom_histogram(binwidth = bin_width)
}
# Distributions for observations during attacks
plot_numeric_hists(train2[train2$ATT_FLAG == "True", ])
# Distributions for observations without attacks
plot_numeric_hists(train2[train2$ATT_FLAG == "False", ])
I still cannot see much, other than that the distributions for observations with attacks and without attacks are different. This is a good sign for now. We can look at the distribution of attack vs. no-attack observations and see what our baseline F-score is on the training data set.
# Class balance of the target: counts of attack ("True") vs non-attack
# ("False") observations
distribution <- table(train2$ATT_FLAG)
distribution
False True
16155 1968
We can predict all True and see what our score is.
# Precision of the all-"True" baseline: actual attacks over all (= all "True")
# predictions. Indexing by level name is clearer than the positional [2].
precision <- distribution["True"] / sum(distribution)
precision
True
0.1085913
# Recall of the all-"True" baseline: every actual attack is flagged, so this
# is trivially 1 (kept as a computation for symmetry with precision).
recall <- distribution["True"] / distribution["True"]
recall
True
1
# F score: harmonic mean of precision and recall for the baseline predictor
f_score <- (2 * precision * recall) / (precision + recall)
f_score
True
0.1959086
Maybe we can look at the distribution of the categorical columns grouped by ATT_FLAG and see if we can identify significant features.
# Binary status columns to compare (PU3/PU5/PU8/PU9 excluded: single-valued)
cats <- c("STATUS_PU1", "STATUS_PU2", "STATUS_PU4", "STATUS_PU6", "STATUS_PU7", "STATUS_PU10", "STATUS_PU11", "STATUS_V2")
# Proportion table for obs with attacks.
# vapply() instead of sapply(): guarantees a 2 x length(cats) integer matrix
# and fails loudly if any column does not yield exactly two table entries,
# where sapply() would silently return a list.
prop.table(vapply(train2[train2$ATT_FLAG == "True", cats], table, integer(2L)), margin = 2)
STATUS_PU1 STATUS_PU2 STATUS_PU4 STATUS_PU6 STATUS_PU7 STATUS_PU10 STATUS_PU11 STATUS_V2
False 0 0.1661585 0.5797764 0.6371951 0.4629065 0.2535569 0.93140244 0.316565
True 1 0.8338415 0.4202236 0.3628049 0.5370935 0.7464431 0.06859756 0.683435
# Proportion table for obs without attacks.
# vapply() instead of sapply() for a guaranteed 2-row integer matrix (see the
# attack-subset table above for rationale).
prop.table(vapply(train2[train2$ATT_FLAG == "False", cats], table, integer(2L)), margin = 2)
STATUS_PU1 STATUS_PU2 STATUS_PU4 STATUS_PU6 STATUS_PU7 STATUS_PU10 STATUS_PU11 STATUS_V2
False 0.001176106 0.2742804 0.5738781 0.993871866 0.154008 0.1871866 0.9993809966 0.263324
True 0.998823894 0.7257196 0.4261219 0.006128134 0.845992 0.8128134 0.0006190034 0.736676
From the above two tables, it seems that we can ignore PU1, PU4 and PU11 since the distributions seem to be the same regardless of whether there is an attack or not. To be sure, we need to do a t-test. (We are definitely ignoring PU3, PU5, PU8 and PU9 since they only have one level)
# Welch two-sample t-test per status column: does the proportion of TRUE
# differ between attack and non-attack observations?
for (col in cats) {
  print(col)
  print(t.test(
    as.logical(train2[train2$ATT_FLAG == "True", col]),
    as.logical(train2[train2$ATT_FLAG == "False", col])
  ))
}
[1] "STATUS_PU1"
Welch Two Sample t-test
data: as.logical(train2[train2$ATT_FLAG == "True", col]) and as.logical(train2[train2$ATT_FLAG == "False", col])
t = 4.3613, df = 16154, p-value = 1.301e-05
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.0006475293 0.0017046836
sample estimates:
mean of x mean of y
1.0000000 0.9988239
[1] "STATUS_PU2"
Welch Two Sample t-test
data: as.logical(train2[train2$ATT_FLAG == "True", col]) and as.logical(train2[train2$ATT_FLAG == "False", col])
t = 11.885, df = 2705.3, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.09028369 0.12596006
sample estimates:
mean of x mean of y
0.8338415 0.7257196
[1] "STATUS_PU4"
Welch Two Sample t-test
data: as.logical(train2[train2$ATT_FLAG == "True", col]) and as.logical(train2[train2$ATT_FLAG == "False", col])
t = -0.50029, df = 2472.7, p-value = 0.6169
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.02901730 0.01722056
sample estimates:
mean of x mean of y
0.4202236 0.4261219
[1] "STATUS_PU6"
Welch Two Sample t-test
data: as.logical(train2[train2$ATT_FLAG == "True", col]) and as.logical(train2[train2$ATT_FLAG == "False", col])
t = 32.848, df = 1979.6, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.3353816 0.3779718
sample estimates:
mean of x mean of y
0.362804878 0.006128134
[1] "STATUS_PU7"
Welch Two Sample t-test
data: as.logical(train2[train2$ATT_FLAG == "True", col]) and as.logical(train2[train2$ATT_FLAG == "False", col])
t = -26.639, df = 2224.9, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.3316382 -0.2861587
sample estimates:
mean of x mean of y
0.5370935 0.8459920
[1] "STATUS_PU10"
Welch Two Sample t-test
data: as.logical(train2[train2$ATT_FLAG == "True", col]) and as.logical(train2[train2$ATT_FLAG == "False", col])
t = -6.4575, df = 2368.2, p-value = 1.288e-10
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.08652527 -0.04621529
sample estimates:
mean of x mean of y
0.7464431 0.8128134
[1] "STATUS_PU11"
Welch Two Sample t-test
data: as.logical(train2[train2$ATT_FLAG == "True", col]) and as.logical(train2[train2$ATT_FLAG == "False", col])
t = 11.921, df = 1971.6, p-value < 2.2e-16
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
0.05679470 0.07916241
sample estimates:
mean of x mean of y
0.0685975610 0.0006190034
[1] "STATUS_V2"
Welch Two Sample t-test
data: as.logical(train2[train2$ATT_FLAG == "True", col]) and as.logical(train2[train2$ATT_FLAG == "False", col])
t = -4.8202, df = 2416.4, p-value = 1.523e-06
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.07490027 -0.03158171
sample estimates:
mean of x mean of y
0.683435 0.736676
The t-tests show us that, among the categorical features, STATUS_PU4 is the only one for which we cannot reject the null hypothesis that the mean is the same for observations with and without attacks — i.e. we may want to consider removing STATUS_PU4 from our models.
We can do the same analysis for the non-categorical features. # TODO
We notice that the data has a date component. Perhaps we should look at it as a time series?
# Treat the data frame as a time series (one observation per row).
# NOTE(review): ts() coerces every column to numeric, including DATETIME and
# the True/False status columns — fine for quick plotting, but the resulting
# numeric codes carry no intrinsic meaning; confirm before using them in models.
train2.ts <- ts(train2)
str(train2.ts)
Time-Series [1:18123, 1:45] from 1 to 18123: 1 2 3 4 5 6 7 8 9 10 ...
- attr(*, "dimnames")=List of 2
..$ : NULL
..$ : chr [1:45] "DATETIME" "LEVEL_T1" "LEVEL_T2" "LEVEL_T3" ...
plot.ts(train2.ts[,"ATT_FLAG"], ylab = "ATT_FLAG")
This looks interesting. From this plot we can see that attacks occur over a duration and there are 7 attacks in this set of training data. This perhaps also suggests that we cannot treat observations independently. If the previous observation is an attack, the next observation is more likely to be an attack as well since attack observations come together.
Another thing is, if we just throw a logistic regression model at this data, we are assuming that all 7 attacks have the same patterns. Perhaps the attacks are executed differently? Maybe I can use clustering techniques to see how many different types of attacks we have here. We have at most 7 possible different kinds of attacks. If we can identify that, maybe we can build one model to identify each type of attack. Which could improve our performance.
We can look at other features and see how they vary with time.
# for (col in colnames(train2.ts)) {
# plot.ts(train2.ts[,col], ylab=col)
# }
Ok there are way too many plots and my R studio is lagging now so I have to comment them out
From the plots we can see a few things. First, there doesn’t seem to be a decreasing or increasing trend in the features across time. This is good as things are pretty constant other than the attacks.
Second, different attacks correspond to different anomalies in the features. This confirms our hypothesis that there is more than one type of attack.
Third, there are many features that just remain constant throughout so we can ignore them.
There is also a lot of noise in the data. We might want to find a way to clean that up.
# Constant / uninformative columns identified from the time-series plots
ignore <- c("LEVEL_T5", "FLOW_PU3", "FLOW_PU5", "FLOW_PU9", "STATUS_PU3", "STATUS_PU5", "STATUS_PU8", "STATUS_PU9")
# Use a negated logical mask rather than -which(...): if no names matched,
# which() returns integer(0) and df[, -integer(0)] selects ZERO columns
# instead of all of them.
train2.small <- train2[ , !(names(train2) %in% ignore)]
train2.small.ts <- ts(train2.small)
# for (col in colnames(train2.small.ts)) {
# if (col != "DATETIME" & col != "ATT_FLAG") {
# plot.ts(train2.small.ts[,col], ylab=col, col=c("black"))
# par(new = TRUE)
# plot.ts(train2.small.ts[,"ATT_FLAG"], axes=FALSE, bty = "n", xlab = "", ylab = "", col="red")
# }
# }
From the above plots, it seems as if attacks 1 and 2 are very similar, 3 and 4 are very similar, and 6 and 7 are very similar. Attack 5 seems to be a slight variant from attacks 6 and 7. There seems to be 3-4 different types of attacks here.
We can use clustering techniques to verify this.
# Elbow plot: total within-cluster sum of squares for k = 1..10
set.seed(1)
k_max <- 10
# Preallocate instead of growing the vector one element per iteration
fit <- numeric(k_max)
for (k in seq_len(k_max)) {
  clusterTrain <- kmeans(train2.small[, 2:28], centers = k, nstart = 20)
  fit[k] <- clusterTrain$tot.withinss
}
plot(seq_len(k_max), fit)
After performing k-means on the non-categorical data, it seems that there are around 4 to 5 clusters (using the elbow method).
# Fit k-means with k = 5 and attach the cluster labels to the data frame
km_fit <- kmeans(train2.small[, 2:28], centers = 5, nstart = 20)
train2.small$cluster <- km_fit$cluster
# Overlay cluster labels (blue) on the attack flag (red) over time to see
# whether the clustering captured the attack patterns
train2.small.ts <- ts(train2.small)
plot.ts(train2.small.ts[, "ATT_FLAG"], col = "red")
par(new = TRUE)
plot.ts(train2.small.ts[, "cluster"], axes = FALSE, bty = "n", xlab = "", ylab = "", col = "blue")
So from the plot above, we can see that kmeans totally didn’t work out. This is sad. How about Hierarchical clustering?
# With the function dist, we calculate the pairwise Euclidean distances.
# NOTE(review): with ~18k rows this produces n*(n-1)/2 values — roughly 164
# million distances — so it is slow and very memory-hungry.
distances <- dist(train2.small[,2:28], method="euclidean")
dim(train2.small)
[1] 18123 38
length(distances)
[1] 164212503
# Execute hierarchical clustering. We use Ward's distance method to find compact clusters.
clusterTrain <- hclust(distances, method="ward.D2")
# Plot the dendrogram. With this many observations, the labels at the bottom are unreadable.
plot(clusterTrain)
# Cut the dendrogram into 5 clusters (matching the elbow estimate above)
clusters <- cutree(clusterTrain, k=5)
# Overlay cluster labels (black) on the attack flag (red) as time series to
# see if clustering captured the attack patterns
train2.small$cluster <- clusters
train2.small.ts <- ts(train2.small)
plot.ts(train2.small.ts[,"ATT_FLAG"], col="red")
par(new = TRUE)
plot.ts(train2.small.ts[,"cluster"], axes=FALSE, bty = "n", xlab = "", ylab = "", col="black")
Still terrible. Ugh, looks like I can’t use clustering to prove anything here. What if I do clustering only on obs with attacks?
# Label each attack period with a sequential attack number (1..7); rows with
# no attack get 0. A new attack starts wherever ATT_FLAG flips from "False"
# to "True". This replaces the original row-by-row loop with vectorized
# logic, and stores the labels as numbers rather than the character strings
# the original produced (it initialized the column with "0" strings, so the
# numeric labels were silently coerced to character).
is_attack <- train2.small$ATT_FLAG == "True"
attack_starts <- is_attack & !c(FALSE, head(is_attack, -1))
train2.small$attack <- ifelse(is_attack, cumsum(attack_starts), 0)
# Sanity check: attack numbers (black) should step up once per red attack block
train2.small.ts <- ts(train2.small)
plot.ts(train2.small.ts[,"ATT_FLAG"], col="red")
par(new = TRUE)
plot.ts(train2.small.ts[,"attack"], axes=FALSE, bty = "n", xlab = "", ylab = "", col="black")
Woohoo! Looks like I did it correctly. So now attacks are labelled 1 to 7. Now we see if attacks of the same type get clustered together.
# Cluster only the attack observations
attacks <- train2.small[train2.small$ATT_FLAG == "True",]
clusterAttacks <- kmeans(attacks[,2:28], centers=5, nstart=20)
attacks$cluster <- clusterAttacks$cluster
# Plot as timeseries
attacks.ts <- ts(attacks)
plot.ts(attacks.ts[,"attack"], col="red")
par(new = TRUE)
# BUG FIX: the original overlaid train2.small.ts[,"cluster"] — the stale
# labels from the earlier full-data clustering, with a different number of
# rows — instead of the attack-only clusters computed just above.
plot.ts(attacks.ts[,"cluster"], axes=FALSE, bty = "n", xlab = "", ylab = "", col="blue")
Didn’t work out again. My explanation is, because noise is so much, points in the same “attack” or “non attack” regions end up being separated. Plus, the obs with attacks differ from obs without attacks in patterns that are most obvious in a time series plot. However, the clustering techniques we have used here only look at absolute values of the features of each observation, and are not able to pick up on patterns over time. I can think of two things to try here, 1. remove noise and 2. model this problem as a time series somehow.
Ok so I just remembered that to do clustering properly, I need to first normalize the data. And if I want to add categorical data in, I can replace True with 1 and False with 0. Let’s do that.
# Copy the data and encode the binary status columns as 0/1 numerics so they
# can be normalized and clustered alongside the continuous features
train2.norm <- train2.small
cats <- c("STATUS_PU1", "STATUS_PU2", "STATUS_PU4", "STATUS_PU6", "STATUS_PU7", "STATUS_PU10", "STATUS_PU11", "STATUS_V2")
# Change categorical data to numeric
for (cat in cats) {
  train2.norm[, cat] <- as.numeric(as.logical(train2.norm[, cat]))
}
# Standardize every feature column; identifiers and labels are skipped.
# Listing the skipped names once replaces the original chain of != tests.
skip_cols <- c("DATETIME", "ATT_FLAG", "cluster", "attack")
for (name in setdiff(names(train2.norm), skip_cols)) {
  # as.numeric() drops attributes: scale() returns a 1-column matrix, and a
  # matrix stored inside a data frame column confuses downstream code (ts())
  train2.norm[, name] <- as.numeric(scale(train2.norm[, name]))
}
Clustering
# K-means on the normalized features, then overlay the cluster labels (blue)
# on the attack numbers (red) as time series
km_norm <- kmeans(train2.norm[, 2:36], centers = 5, nstart = 20)
train2.norm$cluster <- km_norm$cluster
train2.norm.ts <- ts(train2.norm)
plot.ts(train2.norm.ts[, "attack"], col = "red")
par(new = TRUE)
plot.ts(train2.norm.ts[, "cluster"], axes = FALSE, bty = "n", xlab = "", ylab = "", col = "blue")
Looks a bit better but still not very good :( Sad.
The plots were very noisy, making patterns harder to spot. We could try smoothing methods to de-noise the data
library(smooth)
Loading required package: greybox
Package "greybox", v0.3.2 loaded.
This is package "smooth", v2.4.6
Attaching package: 'smooth'
The following object is masked from 'package:greybox':
graphmaker
library(Mcomp)
Loading required package: forecast
# Example of a noisy plot
plot.ts(train2.small.ts[,"LEVEL_T7"], ylab="LEVEL_T7")
# Fit a simple moving average (smooth::sma) with an 18-step forecast horizon
sma1 <- sma(train2.small.ts[,"LEVEL_T7"], h=18, silent = FALSE)
plot(sma1)
plot(forecast(sma1))
sma clearly didn’t work. I’ve got to find something else.
Let’s code the moving average ourselves.
library(zoo)
conti_feats <- c("LEVEL_T1", "LEVEL_T2", "LEVEL_T3", "LEVEL_T4", "LEVEL_T5", "LEVEL_T6", "LEVEL_T7", "PRESSURE_J280", "PRESSURE_J269", "PRESSURE_J300", "PRESSURE_J256", "PRESSURE_J289", "PRESSURE_J415", "PRESSURE_J302", "PRESSURE_J306", "PRESSURE_J307", "PRESSURE_J317", "PRESSURE_J14", "PRESSURE_J422", "FLOW_PU1", "FLOW_PU2", "FLOW_PU3", "FLOW_PU4", "FLOW_PU5", "FLOW_PU6", "FLOW_PU7", "FLOW_PU8", "FLOW_PU9", "FLOW_PU10", "FLOW_PU11", "FLOW_V2")
# Add a 10-point moving average column per continuous feature to smooth noise.
# The original took rollapply's output (whose default align = "center" centers
# each window) and appended nine literal zeros, both misaligning the series
# and fabricating zero readings. align = "left" + fill = NA makes the
# alignment explicit and marks the tail as missing instead of fake data.
for (feat in conti_feats) {
  new_col <- paste0(feat, "_MA10")
  train2[, new_col] <- rollapply(train2[, feat], width = 10, by = 1, FUN = mean, align = "left", fill = NA)
}
# Re-plot every feature (black) with the attack flag (red) overlaid, now that
# the smoothed _MA10 columns are included
train2.ts <- ts(train2)
feature_cols <- setdiff(colnames(train2.ts), c("DATETIME", "ATT_FLAG"))
for (col in feature_cols) {
  plot.ts(train2.ts[, col], ylab = col, col = "black")
  par(new = TRUE)
  plot.ts(train2.ts[, "ATT_FLAG"], axes = FALSE, bty = "n", xlab = "", ylab = "", col = "red")
}